Import Libraries
In [2]:
# Import required libraries or modules
import numpy as np
# Import required libraries or modules
import pandas as pd
# Import required libraries or modules
import matplotlib.pyplot as plt
# Import required libraries or modules
import seaborn as sns
# Import required libraries or modules
import warnings
# Filter rows based on a condition
warnings.filterwarnings('ignore')
# Generate a plot or visualize data
%matplotlib inline
In [3]:
#1)Import Data
In [4]:
# Load data into the DataFrame or variable
taxi_sep_2024=pd.read_parquet('/Users/udaykola/Downloads/yellow_tripdata_2024-09.parquet')
taxi_data=pd.concat([taxi_sep_2024])
In [5]:
taxi_data.head()
Out[5]:
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | Airport_fee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | N | 138 | 48 | 1 | 47.8 | 10.25 | 0.5 | 13.30 | 6.94 | 1.0 | 79.79 | 2.5 | 1.75 |
| 1 | 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | N | 140 | 141 | 1 | 5.1 | 3.50 | 0.5 | 3.00 | 0.00 | 1.0 | 13.10 | 2.5 | 0.00 |
| 2 | 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | N | 238 | 152 | 2 | 13.5 | 1.00 | 0.5 | 0.00 | 0.00 | 1.0 | 16.00 | 0.0 | 0.00 |
| 3 | 2 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | N | 93 | 130 | 1 | 24.7 | 1.00 | 0.5 | 4.55 | 0.00 | 1.0 | 31.75 | 0.0 | 0.00 |
| 4 | 2 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | N | 79 | 231 | 1 | 17.0 | 1.00 | 0.5 | 4.40 | 0.00 | 1.0 | 26.40 | 2.5 | 0.00 |
In [6]:
taxi_data.shape
Out[6]:
(3633030, 19)
2)DAta Exploration¶
In [8]:
taxi_data.columns
Out[8]:
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
'total_amount', 'congestion_surcharge', 'Airport_fee'],
dtype='object')
In [9]:
taxi_data = taxi_data[['tpep_pickup_datetime', 'tpep_dropoff_datetime',
'passenger_count', 'trip_distance', 'RatecodeID','PULocationID',
'DOLocationID', 'payment_type', 'total_amount']]
In [10]:
taxi_data.head()
Out[10]:
| tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | total_amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | 138 | 48 | 1 | 79.79 |
| 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | 140 | 141 | 1 | 13.10 |
| 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | 238 | 152 | 2 | 16.00 |
| 3 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | 93 | 130 | 1 | 31.75 |
| 4 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | 79 | 231 | 1 | 26.40 |
In [11]:
taxi_data.shape
Out[11]:
(3633030, 9)
In [12]:
taxi_data.hist(figsize=(20,10),bins=60)
# Display the result or DataFrame contents
plt.show()
In [13]:
taxi_data[taxi_data['total_amount']<0].shape
Out[13]:
(56109, 9)
In [14]:
# Generate a plot or visualize data
taxi_data.reset_index().plot(kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
In [15]:
# Generate a plot or visualize data
taxi_data[taxi_data['total_amount'] <1000].reset_index().plot(
kind='scatter', x='index', y='total_amount', figsize=(10, 5))
# Display the result or DataFrame contents
plt.show()
In [16]:
# Generate a plot or visualize data
taxi_data[taxi_data['total_amount']<0].reset_index().plot(kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
In [17]:
taxi_data[taxi_data['total_amount']<0]['payment_type'].value_counts()
Out[17]:
payment_type 4 34193 2 13540 3 8016 0 348 1 12 Name: count, dtype: int64
In [18]:
# Import required libraries or modules
import matplotlib.pyplot as plt
# Plotting the histogram
taxi_data[taxi_data['total_amount'] < 0]['trip_distance'].hist(
bins=60, figsize=(10, 5), color='blue', alpha=0.7
)
# Adding labels and title
plt.title("Histogram of Trip Distance for Negative Total Amounts", fontsize=14)
plt.xlabel("Trip Distance", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
# Generate a plot or visualize data
# Display the plot
# Display the result or DataFrame contents
plt.show()
In [19]:
taxi_data[taxi_data['total_amount']==0].shape
Out[19]:
(431, 9)
In [20]:
taxi_data[taxi_data['total_amount']>200].shape
Out[20]:
(2983, 9)
In [21]:
taxi_data[taxi_data['total_amount']>350].shape
Out[21]:
(440, 9)
In [22]:
taxi_data['total_amount'].mean()
Out[22]:
28.539097904503926
In [23]:
#3)Data Cldeaning
# Filter rows based on a condition
taxi_data_filtered=taxi_data[(taxi_data['total_amount']>0) & (taxi_data['total_amount']<350)]
# Filter rows based on a condition
taxi_data_filtered.head()
Out[23]:
| tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | total_amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | 138 | 48 | 1 | 79.79 |
| 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | 140 | 141 | 1 | 13.10 |
| 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | 238 | 152 | 2 | 16.00 |
| 3 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | 93 | 130 | 1 | 31.75 |
| 4 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | 79 | 231 | 1 | 26.40 |
In [24]:
taxi_data.shape
Out[24]:
(3633030, 9)
In [25]:
# Filter rows based on a condition
taxi_data_filtered.shape
Out[25]:
(3576050, 9)
In [26]:
# Filter rows based on a condition
taxi_data_filtered.isnull().sum()
Out[26]:
tpep_pickup_datetime 0 tpep_dropoff_datetime 0 passenger_count 483316 trip_distance 0 RatecodeID 483316 PULocationID 0 DOLocationID 0 payment_type 0 total_amount 0 dtype: int64
In [27]:
# Filter rows based on a condition
taxi_data_filtered['passenger_count'].mean()
Out[27]:
1.3021578965407306
In [28]:
# Filter rows based on a condition
taxi_data_filtered['RatecodeID'].mean()
Out[28]:
2.283585009250715
In [29]:
# Filter rows based on a condition
taxi_data_filtered[taxi_data_filtered['passenger_count'].isnull()].reset_index().plot(
kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
In [30]:
# Filter rows based on a condition
taxi_data_filtered[taxi_data_filtered['passenger_count'].isnull()].reset_index().plot(
kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
In [31]:
# Filter rows based on a condition
taxi_data_filtered=taxi_data_filtered.dropna()
# Filter rows based on a condition
taxi_data_filtered.shape
Out[31]:
(3092734, 9)
In [32]:
#4)data Preparation
# Filter rows based on a condition
taxi_data_prepared=taxi_data_filtered.copy()
In [33]:
taxi_data_prepared.dtypes
Out[33]:
tpep_pickup_datetime datetime64[us] tpep_dropoff_datetime datetime64[us] passenger_count float64 trip_distance float64 RatecodeID float64 PULocationID int32 DOLocationID int32 payment_type int64 total_amount float64 dtype: object
In [34]:
taxi_data_prepared.loc[:,'RatecodeID']=taxi_data_prepared['RatecodeID'].astype(str)
taxi_data_prepared.loc[:,'PULocationID']=taxi_data_prepared['PULocationID'].astype(str)
taxi_data_prepared.loc[:,'DOLocationID']=taxi_data_prepared['DOLocationID'].astype(str)
taxi_data_prepared.loc[:,'payment_type']=taxi_data_prepared['payment_type'].astype(str)
In [35]:
taxi_data_prepared.dtypes
Out[35]:
tpep_pickup_datetime datetime64[us] tpep_dropoff_datetime datetime64[us] passenger_count float64 trip_distance float64 RatecodeID object PULocationID object DOLocationID object payment_type object total_amount float64 dtype: object
In [36]:
taxi_data_prepared.head()
Out[36]:
| tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | total_amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | 138 | 48 | 1 | 79.79 |
| 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | 140 | 141 | 1 | 13.10 |
| 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | 238 | 152 | 2 | 16.00 |
| 3 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | 93 | 130 | 1 | 31.75 |
| 4 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | 79 | 231 | 1 | 26.40 |
In [37]:
taxi_data_prepared['transaction_date']=pd.to_datetime(taxi_data_prepared['tpep_pickup_datetime'].dt.date)
taxi_data_prepared['transaction_year']=taxi_data_prepared['tpep_pickup_datetime'].dt.year
taxi_data_prepared['transaction_month']=taxi_data_prepared['tpep_pickup_datetime'].dt.month
taxi_data_prepared['transaction_day']=taxi_data_prepared['tpep_pickup_datetime'].dt.day
taxi_data_prepared['transaction_hour']=taxi_data_prepared['tpep_pickup_datetime'].dt.hour
In [38]:
taxi_data_prepared.head()
Out[38]:
| tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | total_amount | transaction_date | transaction_year | transaction_month | transaction_day | transaction_hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | 138 | 48 | 1 | 79.79 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | 140 | 141 | 1 | 13.10 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | 238 | 152 | 2 | 16.00 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 3 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | 93 | 130 | 1 | 31.75 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 4 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | 79 | 231 | 1 | 26.40 | 2024-09-01 | 2024 | 9 | 1 | 0 |
In [39]:
taxi_data_prepared['transaction_year'].unique()
Out[39]:
array([2024, 2008, 2009], dtype=int32)
In [40]:
taxi_data_prepared[taxi_data_prepared['transaction_year']!=2024]['payment_type'].value_counts()
Out[40]:
payment_type 1 2 2 1 Name: count, dtype: int64
In [41]:
taxi_data_prepared[taxi_data_prepared['transaction_month']!=9]['payment_type'].value_counts()
Out[41]:
payment_type 1 38 2 9 4 1 Name: count, dtype: int64
In [ ]:
In [42]:
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['transaction_year']==2024]
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['transaction_month']==9]
In [43]:
taxi_data_prepared.shape
Out[43]:
(3092686, 14)
In [44]:
#Noting down the categorical and numerical columns
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['passenger_count']>0]
taxi_data_prepared.shape
Out[44]:
(3062367, 14)
In [45]:
taxi_data_prepared.head()
Out[45]:
| tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | total_amount | transaction_date | transaction_year | transaction_month | transaction_day | transaction_hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-09-01 00:05:51 | 2024-09-01 00:45:03 | 1.0 | 9.80 | 1.0 | 138 | 48 | 1 | 79.79 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 1 | 2024-09-01 00:59:35 | 2024-09-01 01:03:43 | 1.0 | 0.50 | 1.0 | 140 | 141 | 1 | 13.10 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 2 | 2024-09-01 00:25:00 | 2024-09-01 00:34:37 | 2.0 | 2.29 | 1.0 | 238 | 152 | 2 | 16.00 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 3 | 2024-09-01 00:31:00 | 2024-09-01 00:46:52 | 1.0 | 5.20 | 1.0 | 93 | 130 | 1 | 31.75 | 2024-09-01 | 2024 | 9 | 1 | 0 |
| 4 | 2024-09-01 00:11:57 | 2024-09-01 00:30:41 | 2.0 | 2.26 | 1.0 | 79 | 231 | 1 | 26.40 | 2024-09-01 | 2024 | 9 | 1 | 0 |
In [46]:
categorical_columns=['PULocationID','transaction_date','transaction_month','transaction_day','transaction_hour']
numerical_columns=['trip_distance','total_amount']
all_needed_columns=categorical_columns+numerical_columns
In [47]:
main_taxi_df=taxi_data_prepared[all_needed_columns]
print(main_taxi_df.shape)
main_taxi_df.head()
(3062367, 7)
Out[47]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | |
|---|---|---|---|---|---|---|---|
| 0 | 138 | 2024-09-01 | 9 | 1 | 0 | 9.80 | 79.79 |
| 1 | 140 | 2024-09-01 | 9 | 1 | 0 | 0.50 | 13.10 |
| 2 | 238 | 2024-09-01 | 9 | 1 | 0 | 2.29 | 16.00 |
| 3 | 93 | 2024-09-01 | 9 | 1 | 0 | 5.20 | 31.75 |
| 4 | 79 | 2024-09-01 | 9 | 1 | 0 | 2.26 | 26.40 |
In [48]:
# Reset the index of the DataFrame to a default integer-based index
taxi_grouped_by_region=main_taxi_df.groupby(categorical_columns).mean().reset_index()
taxi_grouped_by_region['count_of_transactions'] = main_taxi_df.groupby(
# Reset the index of the DataFrame to a default integer-based index
categorical_columns).count().reset_index()['total_amount']
print(taxi_grouped_by_region.shape)
taxi_grouped_by_region.head()
#Aggregated the columns for the efficiency
(76481, 8)
Out[48]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 |
In [49]:
#model building
data_for_benchmark_model=taxi_grouped_by_region.copy()
In [50]:
categorical_features_benchmark=['PULocationID','transaction_month','transaction_day','transaction_hour']
input_features_benchmark=categorical_features_benchmark+['trip_distance']
target_feature_benchmark='total_amount'
In [51]:
#train model and test model
In [52]:
# Import required libraries or modules
from sklearn.model_selection import train_test_split
X_bench=data_for_benchmark_model[input_features_benchmark]
y_bench=data_for_benchmark_model[target_feature_benchmark]
X_bench=pd.get_dummies(X_bench)
X_train,X_test,y_train,y_test=train_test_split(X_bench,y_bench,test_size=0.2,random_state=10)
In [53]:
X_bench.head()
Out[53]:
| transaction_month | transaction_day | transaction_hour | trip_distance | PULocationID_1 | PULocationID_10 | PULocationID_100 | PULocationID_101 | PULocationID_102 | PULocationID_106 | ... | PULocationID_9 | PULocationID_90 | PULocationID_91 | PULocationID_92 | PULocationID_93 | PULocationID_94 | PULocationID_95 | PULocationID_96 | PULocationID_97 | PULocationID_98 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | 1 | 5 | 0.00 | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | 9 | 1 | 8 | 12.74 | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | 9 | 1 | 10 | 0.04 | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 9 | 1 | 11 | 15.63 | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | 9 | 1 | 13 | 0.00 | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 258 columns
In [54]:
#Fitting the model to the data
#decision tree Model
In [55]:
# Import required libraries or modules
from sklearn.tree import DecisionTreeRegressor
tree=DecisionTreeRegressor(max_depth=10,random_state=10)
tree.fit(X_train,y_train)
tree.score(X_test,y_test)
Out[55]:
0.5603395914225617
In [56]:
# Import required libraries or modules
#Performing Principal Component Analysis It ain't improving
from sklearn.decomposition import PCA
pca=PCA(n_components=200,random_state=10)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)
In [57]:
tree.fit(X_train_pca,y_train)
tree.score(X_test_pca,y_test)
Out[57]:
0.5562141491603332
In [58]:
# Import required libraries or modules
from sklearn.model_selection import cross_val_score
# Performing Cross Evaluation
score=cross_val_score(tree,X_bench,y_bench,cv=5)
print(score)
np.average(score)
# the scores are low and it indicates , the model is not perfect
[0.4568568 0.34361443 0.42339666 0.30446022 0.41343122]
Out[58]:
0.38835186325955096
In [59]:
# Import required libraries or modules
#Performing Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(max_depth=100)
In [60]:
model.fit(X_train,y_train)
model.score(X_test,y_test)
Out[60]:
0.6218083550457949
In [61]:
#6)feature engineering
taxi_grouped_by_region.head()
Out[61]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 |
In [62]:
data_with_new_features=taxi_grouped_by_region.copy()
In [63]:
#Merging the Data
data_with_new_features['transaction_week_day']=data_with_new_features['transaction_date'].dt.weekday
# Monday starts with 0
data_with_new_features['weekend']=data_with_new_features['transaction_week_day'].apply(
lambda x:True if x==5 or x==6 else False)
In [64]:
# Import required libraries or modules
from pandas.tseries.holiday import USFederalHolidayCalendar
#In federal calendar merging with the dates
cal=USFederalHolidayCalendar()
holidays=cal.holidays(start='2023',end='2025').date
data_with_new_features['is_holiday']=data_with_new_features['transaction_date'].isin(holidays)
In [65]:
data_with_new_features.head()
Out[65]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | transaction_week_day | weekend | is_holiday | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 | 6 | True | False |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 | 6 | True | False |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 | 6 | True | False |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 | 6 | True | False |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 | 6 | True | False |
In [66]:
# Load data into the DataFrame or variable
#Combining the data to the old dataset, adding to the location zones
zone_lookup=pd.read_csv('/Users/udaykola/Downloads/taxi-zone-lookup.csv')
zone_lookup=zone_lookup[['LocationID','Borough']]
zone_lookup['LocationID']=zone_lookup['LocationID'].astype(str)
zone_lookup.head()
Out[66]:
| LocationID | Borough | |
|---|---|---|
| 0 | 1 | EWR |
| 1 | 2 | Queens |
| 2 | 3 | Bronx |
| 3 | 4 | Manhattan |
| 4 | 5 | Staten Island |
In [67]:
# Merge or join two DataFrames
data_with_new_features=data_with_new_features.merge(
zone_lookup,left_on='PULocationID',right_on='LocationID',how='left')
data_with_new_features.head()
Out[67]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | transaction_week_day | weekend | is_holiday | LocationID | Borough | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 | 6 | True | False | 1 | EWR |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 | 6 | True | False | 1 | EWR |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 | 6 | True | False | 1 | EWR |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 | 6 | True | False | 1 | EWR |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 | 6 | True | False | 1 | EWR |
In [68]:
data_with_new_features['Borough'].value_counts()
# Import required libraries or modules
import plotly.express as px
# Import required libraries or modules
import pandas as pd
# Data
borough_data = {
'Borough': ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Unknown', 'EWR', 'Staten Island'],
'Pickup_Count': [41057, 14587, 14480, 4700, 1301, 273, 83]
}
# Create or manipulate a DataFrame
# Create a DataFrame
# Create or manipulate a DataFrame
df = pd.DataFrame(borough_data)
# Create the bar chart
fig = px.bar(
df,
x='Borough',
y='Pickup_Count',
title='Most Picked-Up Locations by Borough',
labels={'Pickup_Count': 'Number of Pickups'},
text='Pickup_Count',
color='Pickup_Count', # Add color to differentiate values
color_continuous_scale='Viridis' # Use a visually appealing color scale
)
# Customize the layout
fig.update_traces(
textposition='outside',
marker_line_width=1.5,
marker_line_color='black'
)
fig.update_layout(
xaxis_title='Borough',
yaxis_title='Number of Pickups',
title_x=0.5,
title_font_size=20,
xaxis_tickangle=-45, # Rotate x-axis labels for better visibility
height=600, # Increase the height of the chart
width=1000, # Increase the width of the chart
font=dict(size=14),
# Load data into the DataFrame or variable
margin=dict(l=50, r=50, t=80, b=150), # Adjust margins for readability
)
# Show the chart
# Display the result or DataFrame contents
fig.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [69]:
pip install selenium
Requirement already satisfied: selenium in /opt/anaconda3/lib/python3.12/site-packages (4.27.1) Requirement already satisfied: urllib3<3,>=1.26 in /opt/anaconda3/lib/python3.12/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.3) Requirement already satisfied: trio~=0.17 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (0.27.0) Requirement already satisfied: trio-websocket~=0.9 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (0.11.1) Requirement already satisfied: certifi>=2021.10.8 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (2024.8.30) Requirement already satisfied: typing_extensions~=4.9 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (4.11.0) Requirement already satisfied: websocket-client~=1.8 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (1.8.0) Requirement already satisfied: attrs>=23.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (24.2.0) Requirement already satisfied: sortedcontainers in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (2.4.0) Requirement already satisfied: idna in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (3.7) Requirement already satisfied: outcome in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (1.3.0.post0) Requirement already satisfied: sniffio>=1.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (1.3.0) Requirement already satisfied: wsproto>=0.14 in /opt/anaconda3/lib/python3.12/site-packages (from trio-websocket~=0.9->selenium) (1.2.0) Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /opt/anaconda3/lib/python3.12/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1) Requirement already satisfied: h11<1,>=0.9.0 in /opt/anaconda3/lib/python3.12/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0) Note: you may need to restart the kernel to use updated packages.
In [70]:
# Import required libraries or modules
# from selenium import webdriver
# Import required libraries or modules
# from selenium.webdriver.common.by import By
# Import required libraries or modules
# import pandas as pd
# Import required libraries or modules
# import time
# # Set up Selenium WebDriver
# options = webdriver.ChromeOptions()
# options.add_argument('--headless') # Run in headless mode (no browser UI)
# driver = webdriver.Chrome(options=options)
# # Open the website
# url = "https://www.wunderground.com/history/monthly/us/ny/new-york-city/KLGA/date/2024-1"
# driver.get(url)
# Load data into the DataFrame or variable
# time.sleep(5) # Wait for the page to load
# # Locate the table
# table = driver.find_element(By.CLASS_NAME, "days") # Adjust as needed
# # Extract rows
# rows = table.find_elements(By.TAG_NAME, "tr")
# data = []
# for row in rows[1:]: # Skip header row
# cols = row.find_elements(By.TAG_NAME, "td")
# if len(cols) > 1:
# date = cols[0].text.strip()
# avg_temp = cols[2].text.strip() # Assuming "Avg" is the 3rd column
# data.append([date, avg_temp])
# Create or manipulate a DataFrame
# # Create a DataFrame and save to CSV
# Create or manipulate a DataFrame
# df = pd.DataFrame(data, columns=["Date", "Average Temperature"])
# df.to_csv("average_daily_temperatures.csv", index=False)
# # Clean up
# driver.quit()
# print("Data saved to 'average_daily_temperatures.csv'.")
In [71]:
# Import required libraries or modules
import pandas as pd
# Read the CSV file with the specified encoding
# Load data into the DataFrame or variable
nyc_weather = pd.read_csv('/Users/udaykola/Downloads/sep_weather_data.csv', encoding='ISO-8859-1')
# Clean the dataset by removing the 'Ê' character from all string columns
nyc_weather = nyc_weather.applymap(lambda x: x.replace('Ê', '').strip() if isinstance(x, str) else x)
# Export the cleaned dataset to a new CSV file
# Load data into the DataFrame or variable
cleaned_file_path = '/Users/udaykola/Downloads/sep_weather_data_cleaned.csv'
nyc_weather.to_csv(cleaned_file_path, index=False, encoding='utf-8')
# Display the cleaned dataset's head
print("Cleaned data preview:")
print(nyc_weather.head())
print(f"Cleaned data has been saved to: {cleaned_file_path}")
Cleaned data preview:
Date Temperature Dew Point Humidity Wind speed(mph) Pressure (in) \
0 Sep Avg Avg Avg Avg Avg
1 1-Sep-2024 25.22 19.89 72.6 7.6 29.9
2 2-Sep-2024 22.61 11.11 49.2 13.2 30.0
3 3-Sep-2024 19.33 5.56 41.8 8.3 30.3
4 4-Sep-2024 21 8.5 45.7 6.0 30.4
Precipitation
0 Total
1 0.00
2 0.00
3 0.00
4 0.00
Cleaned data has been saved to: /Users/udaykola/Downloads/sep_weather_data_cleaned.csv
In [72]:
nyc_weather['Precipitation'].unique()
Out[72]:
array(['Total', '0.00', '0.32', '0.01', '0.07', '0.05', '0.22', '0.64'],
dtype=object)
In [73]:
nyc_weather['Humidity'].unique()
Out[73]:
array(['Avg', '72.6', '49.2', '41.8', '45.7', '58.7', '73.5', '74.2',
'48.6', '40.1', '45.3', '47.2', '67.2', '61.6', '66.0', '65.8',
'61.7', '70.4', '76.3', '54.1', '62.4', '60.2', '62.2', '63.9',
'83.4', '74.7', '83.0', '82.7', '69.7'], dtype=object)
In [74]:
# Ensure the Precipitation column is numeric
nyc_weather['Precipitation'] = pd.to_numeric(nyc_weather['Precipitation'], errors='coerce')
# Fill NaN values (if any) with 0 or another default value
nyc_weather['Precipitation'] = nyc_weather['Precipitation'].fillna(0)
# Apply the weather type logic
nyc_weather['Weather_Type'] = nyc_weather['Precipitation'].apply(
lambda x: 'sunny' if x == 0 else ('cloudy' if x < 0.1 else 'rainy')
)
# Preview the results
print(nyc_weather[['Precipitation', 'Weather_Type']].head())
Precipitation Weather_Type 0 0.0 sunny 1 0.0 sunny 2 0.0 sunny 3 0.0 sunny 4 0.0 sunny
In [75]:
# Inspect the unique values in the Date column
print(nyc_weather['Date'].unique())
# Replace invalid entries like "Sep" with NaN
nyc_weather['Date'] = nyc_weather['Date'].replace(r'^\D+$', None, regex=True)
# Convert to datetime and coerce invalid values to NaT
nyc_weather['Date'] = pd.to_datetime(nyc_weather['Date'], format='%d-%b-%Y', errors='coerce')
# Drop rows where Date is NaT (if needed)
nyc_weather = nyc_weather.dropna(subset=['Date'])
# Extract month and day
nyc_weather['month'] = nyc_weather['Date'].dt.month
nyc_weather['day'] = nyc_weather['Date'].dt.day
# Preview the cleaned dataset
print(nyc_weather[['Date', 'month', 'day']].head())
['Sep' '1-Sep-2024' '2-Sep-2024' '3-Sep-2024' '4-Sep-2024' '5-Sep-2024'
'6-Sep-2024' '7-Sep-2024' '8-Sep-2024' '9-Sep-2024' '10-Sep-2024'
'11-Sep-2024' '12-Sep-2024' '13-Sep-2024' '14-Sep-2024' '15-Sep-2024'
'16-Sep-2024' '17-Sep-2024' '18-Sep-2024' '19-Sep-2024' '20-Sep-2024'
'21-Sep-2024' '22-Sep-2024' '23-Sep-2024' '24-Sep-2024' '25-Sep-2024'
'26-Sep-2024' '27-Sep-2024' '28-Sep-2024' '29-Sep-2024' '30-Sep-2024']
Date month day
1 2024-09-01 9 1
2 2024-09-02 9 2
3 2024-09-03 9 3
4 2024-09-04 9 4
5 2024-09-05 9 5
In [76]:
nyc_weather.head()
Out[76]:
| Date | Temperature | Dew Point | Humidity | Wind speed(mph) | Pressure (in) | Precipitation | Weather_Type | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 2 | 2024-09-02 | 22.61 | 11.11 | 49.2 | 13.2 | 30.0 | 0.0 | sunny | 9 | 2 |
| 3 | 2024-09-03 | 19.33 | 5.56 | 41.8 | 8.3 | 30.3 | 0.0 | sunny | 9 | 3 |
| 4 | 2024-09-04 | 21 | 8.5 | 45.7 | 6.0 | 30.4 | 0.0 | sunny | 9 | 4 |
| 5 | 2024-09-05 | 20.83 | 12.33 | 58.7 | 8.8 | 30.3 | 0.0 | sunny | 9 | 5 |
In [ ]:
In [77]:
# Rename the column to remove trailing space
nyc_weather.rename(columns={'Temperature ': 'Temperature'}, inplace=True)
# Sort the data by Date
nyc_weather = nyc_weather.sort_values(by='Date')
# Create the bar chart
fig = px.bar(
nyc_weather,
x='Temperature', # Corrected column name
y='Date',
title='Temperature Trend Over Time (Bar Chart)',
labels={'Temperature': 'Temperature (°C)', 'Date': 'Date'},
orientation='h',
color='Temperature', # Corrected column name
color_continuous_scale='Viridis'
)
# Customize the layout
fig.update_layout(
title_font_size=20,
xaxis_title='Temperature (°C)',
yaxis_title='Date',
height=700,
width=1000,
margin=dict(l=50, r=50, t=80, b=100),
font=dict(size=14),
# Generate a plot or visualize data
plot_bgcolor='rgba(240,240,240,1)',
paper_bgcolor='rgba(240,240,240,1)'
)
# Show the chart
# Display the result or DataFrame contents
fig.show()
In [78]:
#Whole Temperature data After cleaning
nyc_weather.columns = nyc_weather.columns.str.strip()
nyc_weather['Temperature']=nyc_weather['Temperature'].round(1)
nyc_weather.head()
Out[78]:
| Date | Temperature | Dew Point | Humidity | Wind speed(mph) | Pressure (in) | Precipitation | Weather_Type | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 2 | 2024-09-02 | 22.61 | 11.11 | 49.2 | 13.2 | 30.0 | 0.0 | sunny | 9 | 2 |
| 3 | 2024-09-03 | 19.33 | 5.56 | 41.8 | 8.3 | 30.3 | 0.0 | sunny | 9 | 3 |
| 4 | 2024-09-04 | 21 | 8.5 | 45.7 | 6.0 | 30.4 | 0.0 | sunny | 9 | 4 |
| 5 | 2024-09-05 | 20.83 | 12.33 | 58.7 | 8.8 | 30.3 | 0.0 | sunny | 9 | 5 |
In [ ]:
In [79]:
# Merge or join two DataFrames--Final Dataset After merging everything
nyc_taxi_with_weather=data_with_new_features.merge(nyc_weather,left_on=['transaction_month','transaction_day'],
right_on=['month','day'],how='left')
print(nyc_taxi_with_weather.shape)
nyc_taxi_with_weather.head()
(76481, 23)
Out[79]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | transaction_week_day | weekend | ... | Date | Temperature | Dew Point | Humidity | Wind speed(mph) | Pressure (in) | Precipitation | Weather_Type | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 | 6 | True | ... | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 | 6 | True | ... | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 | 6 | True | ... | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 | 6 | True | ... | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 | 6 | True | ... | 2024-09-01 | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | 9 | 1 |
5 rows × 23 columns
In [80]:
nyc_taxi_with_weather.drop(['Date','month','day','LocationID'],axis=1,inplace=True)
In [81]:
# Check for null values in the specified column
nyc_taxi_with_weather.isnull().sum()
Out[81]:
PULocationID 0 transaction_date 0 transaction_month 0 transaction_day 0 transaction_hour 0 trip_distance 0 total_amount 0 count_of_transactions 0 transaction_week_day 0 weekend 0 is_holiday 0 Borough 0 Temperature 0 Dew Point 0 Humidity 0 Wind speed(mph) 0 Pressure (in) 0 Precipitation 0 Weather_Type 0 dtype: int64
In [82]:
nyc_taxi_with_weather.dropna(inplace=True)
print(nyc_taxi_with_weather.shape)
(76481, 19)
In [83]:
# Check for null values in the specified column
nyc_taxi_with_weather.isnull().sum()
Out[83]:
PULocationID 0 transaction_date 0 transaction_month 0 transaction_day 0 transaction_hour 0 trip_distance 0 total_amount 0 count_of_transactions 0 transaction_week_day 0 weekend 0 is_holiday 0 Borough 0 Temperature 0 Dew Point 0 Humidity 0 Wind speed(mph) 0 Pressure (in) 0 Precipitation 0 Weather_Type 0 dtype: int64
In [84]:
nyc_taxi_with_weather.head()
Out[84]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | transaction_week_day | weekend | is_holiday | Borough | Temperature | Dew Point | Humidity | Wind speed(mph) | Pressure (in) | Precipitation | Weather_Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 | 6 | True | False | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 | 6 | True | False | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 | 6 | True | False | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 | 6 | True | False | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 | 6 | True | False | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny |
In [85]:
#Final Model training
data_for_model=nyc_taxi_with_weather.copy()
In [86]:
categorical_features = ['PULocationID', 'transaction_month', 'transaction_day',
'transaction_hour', 'transaction_week_day', 'weekend', 'is_holiday', 'Borough','Weather_Type']
input_features=categorical_features+['trip_distance','Humidity','Wind speed(mph)','Precipitation','Temperature']
target_feature='total_amount'
In [87]:
#Train_test_Split
# Import required libraries or modules
from sklearn.model_selection import train_test_split
X = data_for_model[input_features]
y = data_for_model[target_feature]
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
In [88]:
#Decision Tree
# Import required libraries or modules
from sklearn.tree import DecisionTreeRegressor
decision_tree=DecisionTreeRegressor(max_depth=10)
decision_tree.fit(X_train,y_train)
decision_tree.score(X_test,y_test)
Out[88]:
0.5440474913409042
In [ ]:
In [ ]:
In [ ]:
In [89]:
# Import required libraries or modules-Cross evaluation
from sklearn.model_selection import cross_val_score
print(np.average(cross_val_score(decision_tree,X,y,cv=4)))
0.44656519009400214
In [90]:
# Import required libraries or modules--Random Forest
from sklearn.ensemble import RandomForestRegressor
ran_for=RandomForestRegressor(max_depth=100)
ran_for.fit(X_train,y_train)
ran_for.score(X_test,y_test)
Out[90]:
0.6253728521390483
In [91]:
print(np.average(cross_val_score(ran_for,X,y,cv=4)))
0.4084399751491907
In [92]:
# Import required libraries or modules--Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
grad_boost=GradientBoostingRegressor()
grad_boost.fit(X_train,y_train)
grad_boost.score(X_test,y_test)
Out[92]:
0.6097808967547245
In [93]:
#Cross Evaluation of Gradient Boosting
score3=cross_val_score(grad_boost,X,y,cv=4)
print(score3)
print(np.average(score3))
[0.54236789 0.40341494 0.33071653 0.54390088] 0.45510006032010986
In [94]:
# Import required libraries or modules
from sklearn.metrics import mean_squared_error
print("Mean Squared Error:")
y_pred_1=decision_tree.predict(X_test)
print("Decision Tree:",np.sqrt(mean_squared_error(y_test,y_pred_1)))
y_pred_2=ran_for.predict(X_test)
print("Random Forest:",np.sqrt(mean_squared_error(y_test,y_pred_2)))
y_pred_3=grad_boost.predict(X_test)
print("Gradient Boosting:",np.sqrt(mean_squared_error(y_test,y_pred_3)))
Mean Squared Error: Decision Tree: 15.246643336552005 Random Forest: 13.820191445538459 Gradient Boosting: 14.104857390269597
In [95]:
#Hyper parameter Tuning for accuracy increase on Random Forest
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20,50,100,150,200,300,500]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]
In [96]:
#using Randomized search for faster results
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
In [97]:
# Import required libraries or modules
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10,
cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
# Create or manipulate a DataFrame
performances = pd.DataFrame.from_dict(rf_random.cv_results_)
Fitting 3 folds for each of 10 candidates, totalling 30 fits [CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 8.6s [CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time= 2.7min
In [98]:
performances=performances.sort_values(by='rank_test_score')
performances
Out[98]:
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_n_estimators | param_min_samples_split | param_min_samples_leaf | param_max_features | param_max_depth | param_bootstrap | params | split0_test_score | split1_test_score | split2_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31.079307 | 0.236847 | 1.682551 | 0.061543 | 200 | 10 | 1 | sqrt | 200 | False | {'n_estimators': 200, 'min_samples_split': 10,... | 0.575766 | 0.589501 | 0.591067 | 0.585445 | 0.006874 | 1 |
| 9 | 68.634153 | 0.715173 | 2.943837 | 0.209944 | 600 | 10 | 2 | sqrt | 500 | False | {'n_estimators': 600, 'min_samples_split': 10,... | 0.578348 | 0.586188 | 0.591214 | 0.585250 | 0.005294 | 2 |
| 5 | 157.843519 | 2.504911 | 6.372407 | 0.356554 | 1200 | 5 | 1 | sqrt | 100 | False | {'n_estimators': 1200, 'min_samples_split': 5,... | 0.564866 | 0.581510 | 0.587448 | 0.577941 | 0.009558 | 3 |
| 7 | 82.204359 | 3.291071 | 4.174740 | 0.894944 | 1400 | 10 | 10 | sqrt | None | True | {'n_estimators': 1400, 'min_samples_split': 10... | 0.548150 | 0.558322 | 0.559302 | 0.555258 | 0.005042 | 4 |
| 2 | 10.958274 | 0.124924 | 0.416695 | 0.006461 | 200 | 40 | 4 | sqrt | 20 | False | {'n_estimators': 200, 'min_samples_split': 40,... | 0.504846 | 0.517145 | 0.512012 | 0.511334 | 0.005044 | 5 |
| 1 | 8.379104 | 0.180452 | 0.383341 | 0.018673 | 200 | 5 | 4 | sqrt | 20 | True | {'n_estimators': 200, 'min_samples_split': 5, ... | 0.502565 | 0.513743 | 0.515682 | 0.510663 | 0.005781 | 6 |
| 6 | 27.816893 | 0.082167 | 0.773720 | 0.024157 | 1400 | 5 | 20 | sqrt | 10 | True | {'n_estimators': 1400, 'min_samples_split': 5,... | 0.398235 | 0.404503 | 0.408053 | 0.403597 | 0.004059 | 7 |
| 3 | 0.043025 | 0.030238 | 0.000000 | 0.000000 | 400 | 40 | 10 | auto | 20 | False | {'n_estimators': 400, 'min_samples_split': 40,... | NaN | NaN | NaN | NaN | NaN | 8 |
| 4 | 0.013686 | 0.003050 | 0.000000 | 0.000000 | 1000 | 20 | 20 | auto | 50 | False | {'n_estimators': 1000, 'min_samples_split': 20... | NaN | NaN | NaN | NaN | NaN | 8 |
| 8 | 0.013402 | 0.001718 | 0.000000 | 0.000000 | 1800 | 10 | 2 | auto | 50 | False | {'n_estimators': 1800, 'min_samples_split': 10... | NaN | NaN | NaN | NaN | NaN | 8 |
In [99]:
#Performing Classification model
nyc_class=nyc_taxi_with_weather.copy()
nyc_class['earning_class']=nyc_class['total_amount'].apply(lambda x:'low' if x<=15 else 'high')
nyc_class['earning_class_binary']=nyc_class['earning_class'].apply(lambda x:0 if x=='low' else 1)
nyc_class.head()
Out[99]:
| PULocationID | transaction_date | transaction_month | transaction_day | transaction_hour | trip_distance | total_amount | count_of_transactions | transaction_week_day | weekend | ... | Borough | Temperature | Dew Point | Humidity | Wind speed(mph) | Pressure (in) | Precipitation | Weather_Type | earning_class | earning_class_binary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-09-01 | 9 | 1 | 5 | 0.00 | 116.00 | 1 | 6 | True | ... | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | high | 1 |
| 1 | 1 | 2024-09-01 | 9 | 1 | 8 | 12.74 | 52.50 | 1 | 6 | True | ... | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | high | 1 |
| 2 | 1 | 2024-09-01 | 9 | 1 | 10 | 0.04 | 121.00 | 1 | 6 | True | ... | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | high | 1 |
| 3 | 1 | 2024-09-01 | 9 | 1 | 11 | 15.63 | 141.96 | 1 | 6 | True | ... | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | high | 1 |
| 4 | 1 | 2024-09-01 | 9 | 1 | 13 | 0.00 | 81.01 | 1 | 6 | True | ... | EWR | 25.22 | 19.89 | 72.6 | 7.6 | 29.9 | 0.0 | sunny | high | 1 |
5 rows × 21 columns
In [100]:
#Creating a new Target Variable
nyc_class['earning_class'].value_counts()
Out[100]:
earning_class high 72283 low 4198 Name: count, dtype: int64
In [101]:
#Preparing for the model Building
categorical_features = ['PULocationID', 'transaction_month', 'transaction_day',
'transaction_hour', 'transaction_week_day', 'weekend', 'is_holiday', 'Borough','Weather_Type']
input_features = categorical_features + ['trip_distance','Humidity','Wind speed(mph)','Precipitation','Temperature']
target_feature = 'earning_class_binary'
In [102]:
# Import required libraries or modules
from sklearn.model_selection import train_test_split
X_c = nyc_class[input_features]
y_c = nyc_class[target_feature]
X_c = pd.get_dummies(X_c)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.33, random_state=10)
In [ ]:
In [103]:
X_c.head()
Out[103]:
| transaction_month | transaction_day | transaction_hour | transaction_week_day | weekend | is_holiday | trip_distance | Precipitation | PULocationID_1 | PULocationID_10 | ... | Temperature_21 | Temperature_21.17 | Temperature_21.78 | Temperature_22.11 | Temperature_22.39 | Temperature_22.61 | Temperature_22.78 | Temperature_23.17 | Temperature_24.06 | Temperature_25.22 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | 1 | 5 | 6 | True | False | 0.00 | 0.0 | True | False | ... | False | False | False | False | False | False | False | False | False | True |
| 1 | 9 | 1 | 8 | 6 | True | False | 12.74 | 0.0 | True | False | ... | False | False | False | False | False | False | False | False | False | True |
| 2 | 9 | 1 | 10 | 6 | True | False | 0.04 | 0.0 | True | False | ... | False | False | False | False | False | False | False | False | False | True |
| 3 | 9 | 1 | 11 | 6 | True | False | 15.63 | 0.0 | True | False | ... | False | False | False | False | False | False | False | False | False | True |
| 4 | 9 | 1 | 13 | 6 | True | False | 0.00 | 0.0 | True | False | ... | False | False | False | False | False | False | False | False | False | True |
5 rows × 351 columns
In [104]:
# Import required libraries or modules
#random Forest regression after the Hyper Parameter Tuning
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train_c, y_train_c)
clf.score(X_test_c,y_test_c)
Out[104]:
0.9581599904909069
In [105]:
print(np.average(cross_val_score(clf,X_c,y_c,cv=4)))
0.9140827516581804
In [106]:
# Create the feature importance DataFrame
importance_df = pd.DataFrame({
'Feature': X_train.columns, # Replace with actual feature names
'Importance': ran_for.feature_importances_ # Extract feature importances
}).sort_values(by='Importance', ascending=False)
# Display the top features for verification
print(importance_df.head(10))
Feature Importance 6 trip_distance 0.476360 2 transaction_hour 0.072732 184 PULocationID_265 0.054567 266 Borough_Queens 0.042112 1 transaction_day 0.025521 3 transaction_week_day 0.015892 131 PULocationID_216 0.015718 8 PULocationID_1 0.010688 264 Borough_EWR 0.008706 265 Borough_Manhattan 0.006942
In [ ]:
In [ ]:
In [107]:
# Import required libraries or modules
from sklearn.model_selection import cross_val_score
# Check accuracy explicitly
accuracy = np.average(cross_val_score(clf, X_c, y_c, cv=4, scoring='accuracy'))
print(f"Cross-validated Accuracy: {accuracy}")
# Check log loss
log_loss = np.average(cross_val_score(clf, X_c, y_c, cv=4, scoring='neg_log_loss'))
print(f"Cross-validated Log Loss: {-log_loss}") # Negate to get positive log loss
Cross-validated Accuracy: 0.9117553410137029 Cross-validated Log Loss: 0.21449803233267062
In [108]:
# Define the target variable as the binary column
target_feature = 'earning_class_binary'
y_c = nyc_class[target_feature]
# Train the classifier
# Import required libraries or modules
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train_c, y_train_c) # Ensure y_train_c is categorical (e.g., 0 or 1)
print(f"Test Accuracy: {clf.score(X_test_c, y_test_c):.2f}")
Test Accuracy: 0.96
In [109]:
# Import required libraries or modules
from sklearn.metrics import mean_squared_error
# Step 1: Get predictions (using class labels)
y_pred = clf.predict(X_test_c)
# Step 2: Compute MSE
mse = mean_squared_error(y_test_c, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 0.040928721423194266
In [ ]:
In [110]:
# Import required libraries or modules
from sklearn.metrics import mean_absolute_error
# Get predictions
y_pred = clf.predict(X_test_c)
# Compute MAE
mae = mean_absolute_error(y_test_c, y_pred)
print("Mean Absolute Error:", mae)
Mean Absolute Error: 0.040928721423194266
In [111]:
# Import required libraries or modules
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Predictions
y_pred = clf.predict(X_test_c)
# MAE
mae = mean_absolute_error(y_test_c, y_pred)
print("Mean Absolute Error:", mae)
# MSE
mse = mean_squared_error(y_test_c, y_pred)
print("Mean Squared Error:", mse)
Mean Absolute Error: 0.040928721423194266 Mean Squared Error: 0.040928721423194266
In [112]:
# Sort features by importance
top_features = importance_df.head(10) # Display top 10 features
# Plot
plt.figure(figsize=(10, 6))
plt.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Top 10 Features Importance for Taxi Fare Prediction')
plt.gca().invert_yaxis()
plt.show()
In [113]:
# Import required libraries or modules
# Set threshold for feature importance
threshold = 0.01
# Import required libraries or modules
# Separate important and less important features
# Import required libraries or modules
important_features = importance_df[importance_df['Importance'] >= threshold]
# Import required libraries or modules
other_features = importance_df[importance_df['Importance'] < threshold].sum(numeric_only=True)
# Perform group-by operation or aggregation
# Add aggregated "Other Features"
# Create or manipulate a DataFrame
other_row = pd.DataFrame({'Feature': ['Other Features'], 'Importance': [other_features['Importance']]})
# Import required libraries or modules
important_features = pd.concat([important_features, other_row], ignore_index=True)
# Plot
plt.figure(figsize=(10, 6))
# Import required libraries or modules
plt.barh(important_features['Feature'], important_features['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importance with Aggregated Minor Features')
plt.gca().invert_yaxis()
# Display the result or DataFrame contents
plt.show()
In [ ]: